## Boix 2008 WP

setwd("/Users/ranjitlall/Documents/Ranjit's work/Harvard/Missing data in CIPE/Drafts/PA/Revisions Jun 2016/Replication materials/Boix 2008 WP")
library(foreign)
library(Amelia)

## Load original dataset
b2008 <- read.dta("B2008 WP Rep Data.dta")
head(b2008)
dim(b2008)

## Drop ID vars
b2008$abbreviation <- b2008$ccode <-b2008$imfcode <-b2008$id <- b2008$isocode <- b2008$firstclass_mail_17digits <- b2008$namecountry <- b2008$allmail_17digits <- NULL

## Drop derived dummy vars
b2008$deceni <- b2008$interwar <- b2008$postwar <- b2008$firstwave <- b2008$twenties <- b2008$thirties <-b2008$forties <-b2008$fifties <-b2008$sixties <-b2008$seventies <-b2008$eighties <- b2008$twenties <- b2008$forties <- b2008$thirties <- b2008$fifties <- b2008$america <- b2008$europe <- b2008$Africa <- b2008$asia <- b2008$oceania <- NULL

## Drop calculated parameters
b2008$typeii_error <- b2008$rdem10yr <-b2008$rmyd10yr <- b2008$inter_1 <-b2008$inter_10 <- <-b2008$seven_year_average <-b2008$seven_year_total <- b2008$AVERAGEGINIADJ <- b2008$iscstandrdzscore <- NULL

## Drop vars with no obs
b2008$region <- b2008$firstclass_mail <- b2008$free_of_blackmarket_rate <- b2008$guerilla_warfare <- b2008$percent_gdp_ind_activity <- b2008$secondaryschool_enr <-b2008$telegraph_miles <-b2008$percent_gdp_ind_activity <-b2008$exports_a <- b2008$population_density_empire <- b2008$voter_turnout <- NULL

## Drop interpolated vars
b2008$ycap <- b2008$ycapt1 <- NULL

## How many variables? 417: reduction necessary
dim(b2008)

## Which variables are in analysis and have missing data?
sum(is.na(b2008 $waronset_updated))/nrow(b2008)*100
sum(is.na(b2008 $cwar_updatedt1))/nrow(b2008)*100
sum(is.na(b2008 $cwar_updated))/nrow(b2008)*100
sum(is.na(b2008 $avgff_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $cwaravgff_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $avgiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $cwaravgiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $avgffiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $cwaravgffiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $loginct1))/nrow(b2008)*100
sum(is.na(b2008 $cwarloginct1))/nrow(b2008)*100
sum(is.na(b2008 $logpopulation))/nrow(b2008)*100
sum(is.na(b2008 $cwarlogpopulationt1))/nrow(b2008)*100
sum(is.na(b2008 $democracyt1))/nrow(b2008)*100
sum(is.na(b2008 $cwardemocracyt1))/nrow(b2008)*100
sum(is.na(b2008 $cwar_updatedt1))/nrow(b2008)*100
sum(is.na(b2008 $sovereign))/nrow(b2008)*100
sum(is.na(b2008 $year))/nrow(b2008)*100 ## N
sum(is.na(b2008 $country))/nrow(b2008)*100 ## N
sum(is.na(b2008 $dummyguerrilla))/nrow(b2008)*100
sum(is.na(b2008 $guerrillaavgff_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $guerrillaavgiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $guerrillaavgffiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $guerrillaloginct1))/nrow(b2008)*100
sum(is.na(b2008 $guerrillalogpopulationt1))/nrow(b2008)*100
sum(is.na(b2008 $guerrillademocracyt1))/nrow(b2008)*100
sum(is.na(b2008 $dummyguerrillat1))/nrow(b2008)*100
sum(is.na(b2008 $dummyrevolutions))/nrow(b2008)*100
sum(is.na(b2008 $revolutavgff_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $revolutavgiod_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $revolutavgiodff_vanhanent1))/nrow(b2008)*100
sum(is.na(b2008 $revolutloginct1))/nrow(b2008)*100
sum(is.na(b2008 $revolutlogpopulation))/nrow(b2008)*100
sum(is.na(b2008 $revolutdemocracyt1))/nrow(b2008)*100
sum(is.na(b2008 $dummyrevolutionst1))/nrow(b2008)*100

analysis <- as.data.frame(cbind(b2008$waronset_updated, b2008$cwar_updatedt1, b2008$cwar_updated, b2008$avgff_vanhanent1, b2008$cwaravgff_vanhanent1, b2008$avgiod_vanhanent1, b2008$cwaravgiod_vanhanent1, b2008$avgffiod_vanhanent1, b2008$cwaravgffiod_vanhanent1, b2008$loginct1, b2008$cwarloginct1, b2008$logpopulation, b2008$cwarlogpopulationt1, b2008$democracyt1, b2008$cwardemocracyt1, b2008$cwar_updatedt1, b2008$sovereign, b2008$dummyguerrilla, b2008$guerrillaavgff_vanhanent1, b2008$guerrillaavgiod_vanhanent1, b2008$guerrillaavgffiod_vanhanent1, b2008$guerrillaloginct1, b2008$guerrillalogpopulationt1, b2008$guerrillademocracyt1, b2008$dummyguerrillat1, b2008$dummyrevolutions, b2008$revolutavgff_vanhanent1, b2008$revolutavgiod_vanhanent1, b2008$revolutavgiodff_vanhanent1, b2008$revolutloginct1, b2008$revolutlogpopulation, b2008 $revolutdemocracyt1, b2008$dummyrevolutionst1))

missing <-as.data.frame(cbind(as.integer(complete.cases(b2008$waronset_updated)), as.integer(complete.cases(b2008$cwar_updatedt1)), as.integer(complete.cases(b2008$cwar_updated)), as.integer(complete.cases(b2008$avgff_vanhanent1)), as.integer(complete.cases(b2008$cwaravgff_vanhanent1)), as.integer(complete.cases(b2008$avgiod_vanhanent1)), as.integer(complete.cases(b2008$cwaravgiod_vanhanent1)), as.integer(complete.cases(b2008$avgffiod_vanhanent1)), as.integer(complete.cases(b2008$cwaravgffiod_vanhanent1)), as.integer(complete.cases(b2008$loginct1)), as.integer(complete.cases(b2008$cwarloginct1)), as.integer(complete.cases(b2008$logpopulation)), as.integer(complete.cases(b2008$cwarlogpopulationt1)), as.integer(complete.cases(b2008$democracyt1)), as.integer(complete.cases(b2008$cwardemocracyt1)), as.integer(complete.cases(b2008$cwar_updatedt1)), as.integer(complete.cases(b2008$sovereign)), as.integer(complete.cases(b2008$dummyguerrilla)), as.integer(complete.cases(b2008$guerrillaavgff_vanhanent1)), as.integer(complete.cases(b2008$guerrillaavgiod_vanhanent1)), as.integer(complete.cases(b2008$guerrillaavgffiod_vanhanent1)), as.integer(complete.cases(b2008$guerrillaloginct1)), as.integer(complete.cases(b2008$guerrillalogpopulationt1)), as.integer(complete.cases(b2008$guerrillademocracyt1)), as.integer(complete.cases(b2008$dummyguerrillat1)), as.integer(complete.cases(b2008$dummyrevolutions)), as.integer(complete.cases(b2008$revolutavgff_vanhanent1)), as.integer(complete.cases(b2008$revolutavgiod_vanhanent1)), as.integer(complete.cases(b2008$revolutavgiodff_vanhanent1)), as.integer(complete.cases(b2008$revolutloginct1)), as.integer(complete.cases(b2008$revolutlogpopulation)), as.integer(complete.cases(b2008$revolutdemocracyt1)), as.integer(complete.cases(b2008$dummyrevolutionst1))))

dim(analysis)
dim(missing)
apply(missing, 2, sd)

## Which vars have less than 25% missing data?
colSums(is.na(b2008))/nrow(b2008)*100

include <- c("waronset_updated", "cwar_updatedt1", "cwar_updated", "avgff_vanhanent1", "cwaravgff_vanhanent1", "avgiod_vanhanent1", "cwaravgiod_vanhanent1", "avgffiod_vanhanent1", "cwaravgffiod_vanhanent1", "loginct1", "cwarloginct1", "logpopulation", "cwarlogpopulationt1", "democracyt1", "cwardemocracyt1", "cwar_updatedt1", "sovereign", "dummyguerrilla", "guerrillaavgff_vanhanent1", "guerrillaavgiod_vanhanent1", "guerrillaavgffiod_vanhanent1", "guerrillaloginct1", "guerrillalogpopulationt1", "guerrillademocracyt1", "dummyguerrillat1", "dummyrevolutions", "revolutavgff_vanhanent1", "revolutavgiod_vanhanent1", "revolutavgiodff_vanhanent1", "revolutloginct1", "revolutlogpopulation", "revolutdemocracyt1", "dummyrevolutionst1", "country", "year")

b2008 <- b2008[include]

## Imputation
dim(b2008)

## What is average percentage of missing data?
NAs <- function(x) {
    as.vector(apply(x, 2, function(x) length(which(is.na(x)))))
    }
NAs(b2008)
mean(NAs(b2008)/nrow(b2008))*100

## Thus: 67 imputations

set.seed(02138)
b2008.out <- amelia(b2008, m = 67, cs = "country", ts = "year", polytime = 3, lags = c("waronset_updated", "cwar_updated", "dummyguerrilla", "dummyrevolutions", "avgffiod_vanhanent1", "cwaravgffiod_vanhanent1", "guerrillaavgffiod_vanhanent1", "revolutavgiodff_vanhanent1"), empri = 0.01*nrow(b2008))

write.amelia(obj= b2008.out, file.stem = "B2008 WP Imp Data", format = "dta", separate = FALSE)
